Read Data In

college_stats <- read_csv("raw data/cfbd_stats_season_player.csv")
teams <- read_csv("raw data/cfbd_team_info.csv")

draft_info <- read_csv("raw data/cfbd_draft_picks.csv")
draft_combine <- read_csv("raw data/load_combine.csv")
draft_nfl <- read_csv("raw data/load_draft_picks.csv")
rosters <- read_csv("raw data/load_rosters_weekly.csv")

Clean & Prepare College Data

Number of colleges attended

num_schools <- college_stats %>% 
  group_by(athlete_id) %>% 
  summarize(num_schools = n_distinct(team))

Check missing data

gg_miss_var(college_stats)

missing_all <- college_stats %>% 
  select(-c("year", "team", "conference", "athlete_id", "player", "position")) %>% 
  filter(if_all(everything(), is.na)) 

0 observations missing all, so less concerned.

Filter to just player’s last year in dataset/college

senior_stats <- college_stats %>% 
  group_by(athlete_id) %>% 
  filter(year == max(year, na.rm = TRUE)) %>% 
  ungroup() %>% 
  left_join(num_schools, by = "athlete_id")

Modify position variables

senior_stats <- senior_stats %>% 
  mutate(position = ifelse(position %in% c("?", "ATH"), NA, position),
         side_of_ball = case_when(position %in% c("WR", "TE", "RB", "QB", "FB",
                                                  "OL", "C", "OT", "G") ~ "Off",
                                  position %in% c("DB", "CB", "S", "LB", "DL", 
                                                  "DE", "ILB", "OLB", "DT", 
                                                  "NT", "EDGE") ~ "Def",
                                  position %in% c("PK", "P", "LS", "PR") ~ "ST",
                                  TRUE ~ NA),
         position_group = case_when(position %in% c("WR") ~ "WR",
                                    position %in% c("DB", "CB", "S") ~ "DB",
                                    position %in% c("PK", "P", "LS", "PR") ~ "ST",
                                    position %in% c("TE") ~ "TE",
                                    position %in% c("RB", "FB") ~ "RB",
                                    position %in% c("LB", "ILB", "OLB") ~ "LB",
                                    position %in% c("DL", "DE", "DT", "NT", 
                                                    "EDGE") ~ "DL",
                                    position %in% c("QB") ~ "QB",
                                    position %in% c("OL", "C", "OT", "G") ~ "OL",
                                    TRUE ~ NA))

Check players who show up more than once

check_dups <- senior_stats %>% # 7 players show up twice in senior_stats
  count(athlete_id) %>% 
  filter(n > 1) %>% 
  select(athlete_id) 

check_dups
## # A tibble: 7 × 1
##   athlete_id
##        <dbl>
## 1    4037217
## 2    4243817
## 3    4362128
## 4    4430675
## 5    4431348
## 6    4570184
## 7    4576147
senior_stats %>% filter(athlete_id %in% check_dups$athlete_id) 
## # A tibble: 14 × 63
##     year team          conference athlete_id player position passing_completions
##    <dbl> <chr>         <chr>           <dbl> <chr>  <chr>                  <dbl>
##  1  2021 Old Dominion  Conferenc…    4037217 Brand… TE                        NA
##  2  2021 Wake Forest   ACC           4037217 Brand… TE                        NA
##  3  2021 Florida Atla… Conferenc…    4243817 Diash… CB                        NA
##  4  2021 UTEP          Conferenc…    4243817 Diash… CB                        NA
##  5  2021 UCF           American …    4362128 Justi… LB                        NA
##  6  2021 UConn         FBS Indep…    4362128 Justi… LB                        NA
##  7  2021 Oklahoma      Big 12        4430675 Andre… RB                        NA
##  8  2021 Oklahoma Sta… Big 12        4430675 Andre… RB                        NA
##  9  2021 Arizona       Pac-12        4431348 Malik… LB                        NA
## 10  2021 Colorado      Pac-12        4431348 Malik… LB                        NA
## 11  2021 Fresno State  Mountain …    4570184 Evan … PK                        NA
## 12  2021 UNLV          Mountain …    4570184 Evan … PK                        NA
## 13  2021 Old Dominion  Conferenc…    4576147 Colt … OL                        NA
## 14  2021 Western Kent… Conferenc…    4576147 Colt … OL                        NA
## # ℹ 56 more variables: passing_att <dbl>, passing_pct <dbl>, passing_yds <dbl>,
## #   passing_td <dbl>, passing_int <dbl>, passing_ypa <dbl>, rushing_car <dbl>,
## #   rushing_yds <dbl>, rushing_td <dbl>, rushing_ypc <dbl>, rushing_long <dbl>,
## #   receiving_rec <dbl>, receiving_yds <dbl>, receiving_td <dbl>,
## #   receiving_ypr <dbl>, receiving_long <dbl>, fumbles_fum <dbl>,
## #   fumbles_rec <dbl>, fumbles_lost <dbl>, defensive_solo <dbl>,
## #   defensive_tot <dbl>, defensive_tfl <dbl>, defensive_sacks <dbl>, …
## these players all marked as playing for an opponent in >= 1 game

senior_stats <- senior_stats[!(senior_stats$athlete_id == "4037217" & 
                                 senior_stats$team == "Old Dominion"), ] 
senior_stats <- senior_stats[!(senior_stats$athlete_id == "4243817" & 
                                 senior_stats$team == "UTEP"), ]
senior_stats <- senior_stats[!(senior_stats$athlete_id == "4362128" & 
                                 senior_stats$team == "UCF"), ]
senior_stats <- senior_stats[!(senior_stats$athlete_id == "4430675" & 
                                 senior_stats$team == "Oklahoma"), ]
senior_stats <- senior_stats[!(senior_stats$athlete_id == "4431348" & 
                                 senior_stats$team == "Colorado"), ]
senior_stats <- senior_stats[!(senior_stats$athlete_id == "4570184" & 
                                 senior_stats$team == "Fresno State"), ]
senior_stats <- senior_stats[!(senior_stats$athlete_id == "4576147" & 
                                 senior_stats$team == "Old Dominion"), ]
senior_stats <- senior_stats[!(senior_stats$athlete_id == "4680602" &
                                 senior_stats$team == "Kentucky"), ]

check_dups <- senior_stats %>% # 7 players show up twice in senior_stats
  count(athlete_id) %>% 
  filter(n > 1) %>% 
  select(athlete_id) 
check_dups
## # A tibble: 0 × 1
## # ℹ 1 variable: athlete_id <dbl>

Create Separate Dfs for each position

They have very different statistics

Drop PRs (<300 and they have varying stats)

senior_stats <- senior_stats %>% 
  filter(position != "PR")

def_stats <- c("fumbles_rec", "defensive_solo", "defensive_tot", "defensive_tfl",
               "defensive_sacks", "defensive_qb_hur", "interceptions_int", 
               "interceptions_yds", "interceptions_avg", "interceptions_td",
               "defensive_pd", "defensive_td")

qb_stats <-  c("passing_completions", "passing_att", "passing_pct", "passing_yds",
               "passing_td", "passing_int", "passing_ypa", "rushing_car", 
               "rushing_yds", "rushing_td", "rushing_ypc", "rushing_long",
               "fumbles_fum", "fumbles_rec", "fumbles_lost")

wr_rb_stats <- c("rushing_car", "rushing_yds", "rushing_td", "rushing_ypc", 
                 "rushing_long", "receiving_rec", "receiving_yds", "receiving_td",
                 "receiving_ypr", "receiving_long", "fumbles_fum", "fumbles_rec", 
                 "fumbles_lost", "punt_returns_no", "punt_returns_yds",
                 "punt_returns_avg", "punt_returns_td", "punt_returns_long")

st_stats <- c("kicking_fgm", "kicking_fga", "kicking_pct", "kicking_xpa",
              "kicking_xpm", "kicking_pts", "kicking_long", "punting_no",
              "punting_yds", "punting_ypp", "punting_long", "punting_in_20",
              "punting_tb")

def_stats_df <- senior_stats %>% 
  filter(side_of_ball == "Def") %>% 
  select(year, team, conference, athlete_id, player, position, position_group,
         all_of(def_stats))

qb_stats_df <- senior_stats %>% 
  filter(position == "QB") %>% 
  select(year, team, conference, athlete_id, player, position, position_group,
         all_of(qb_stats))

wr_rb_stats_df <- senior_stats %>% 
  filter(position %in% c("WR", "TE", "RB", "FB")) %>% 
  select(year, team, conference, athlete_id, player, position, position_group,
         all_of(wr_rb_stats))

st_stats_df <- senior_stats %>% 
  filter(side_of_ball == "ST") %>% 
  select(year, team, conference, athlete_id, player, position, position_group,
         all_of(st_stats))

Create above/below/at average flags for each position

get_mode <- function(v) {
  uniqv <- unique(v)
  uniqv[which.max(tabulate(match(v, uniqv)))]
}

def_stats_avg <- def_stats_df %>% 
  group_by(position) %>% 
  select(-year, -team, -conference, -athlete_id, -player, -position_group) %>% 
  summarise(across(everything(), ~mean(.x, na.rm = TRUE)))

qb_stats_avg <- qb_stats_df %>% 
  group_by(position) %>% 
  select(-year, -team, -conference, -athlete_id, -player, -position_group) %>% 
  summarise(across(everything(), ~mean(.x, na.rm = TRUE)))

wr_rb_stats_avg <- wr_rb_stats_df %>% 
  group_by(position) %>% 
  select(-year, -team, -conference, -athlete_id, -player, -position_group) %>% 
  summarise(across(everything(), ~mean(.x, na.rm = TRUE)))

st_stats_avg <- st_stats_df %>% 
  group_by(position) %>% 
  select(-year, -team, -conference, -athlete_id, -player, -position_group) %>% 
  summarise(across(everything(), ~mean(.x, na.rm = TRUE)))


st_flags <- st_stats_df %>%
  left_join(st_stats_avg, by = "position", suffix = c("", "_pos_avg")) %>%
  mutate(across(
    all_of(st_stats),
    ~ case_when(
        .x > get(paste0(cur_column(), "_pos_avg")) ~ "above",
        .x < get(paste0(cur_column(), "_pos_avg")) ~ "below",
        TRUE ~ "at"
      ),
    .names = "{.col}_flag"
  ))  %>% 
  select(athlete_id, position, ends_with("_flag")) %>% 
  rowwise() %>%
  mutate(pos_skill_flag = get_mode(c_across(ends_with("_flag")))) %>%
  ungroup()
  
wr_rb_flags <- wr_rb_stats_df %>%
  left_join(wr_rb_stats_avg, by = "position", suffix = c("", "_pos_avg")) %>%
  mutate(across(
    all_of(wr_rb_stats),
    ~ case_when(
        .x > get(paste0(cur_column(), "_pos_avg")) ~ "above",
        .x < get(paste0(cur_column(), "_pos_avg")) ~ "below",
        TRUE ~ "at"
      ),
    .names = "{.col}_flag"
  )) %>% 
  select(athlete_id, position, ends_with("_flag")) %>% 
  rowwise() %>%
  mutate(pos_skill_flag = get_mode(c_across(ends_with("_flag")))) %>%
  ungroup()

def_flags <- def_stats_df %>%
  left_join(def_stats_avg, by = "position", suffix = c("", "_pos_avg")) %>%
  mutate(across(
    all_of(def_stats),
    ~ case_when(
        .x > get(paste0(cur_column(), "_pos_avg")) ~ "above",
        .x < get(paste0(cur_column(), "_pos_avg")) ~ "below",
        TRUE ~ "at"
      ),
    .names = "{.col}_flag"
  )) %>% 
  select(athlete_id, position, ends_with("_flag")) %>% 
  rowwise() %>%
  mutate(pos_skill_flag = get_mode(c_across(ends_with("_flag")))) %>%
  ungroup()

qb_flags <- qb_stats_df %>%
  left_join(qb_stats_avg, by = "position", suffix = c("", "_pos_avg")) %>%
  mutate(across(
    all_of(qb_stats),
    ~ case_when(
        .x > get(paste0(cur_column(), "_pos_avg")) ~ "above",
        .x < get(paste0(cur_column(), "_pos_avg")) ~ "below",
        TRUE ~ "at"
      ),
    .names = "{.col}_flag"
  )) %>% 
  select(athlete_id, position, ends_with("_flag")) %>% 
  rowwise() %>%
  mutate(pos_skill_flag = get_mode(c_across(ends_with("_flag")))) %>%
  ungroup()

senior_stats <- senior_stats %>%
  left_join(qb_flags %>% select(athlete_id, 
                                pos_skill_flag_qb = pos_skill_flag),
            by = "athlete_id") %>% 
  left_join(wr_rb_flags %>% select(athlete_id, 
                                   pos_skill_flag_wr_rb = pos_skill_flag),
            by = "athlete_id") %>% 
  left_join(def_flags %>% select(athlete_id, 
                                 pos_skill_flag_def = pos_skill_flag),
            by = "athlete_id") %>% 
  left_join(st_flags %>% select(athlete_id, 
                                pos_skill_flag_st = pos_skill_flag),
            by = "athlete_id") %>%
  mutate(pos_skill_flag = coalesce(pos_skill_flag_qb, pos_skill_flag_wr_rb,
                                   pos_skill_flag_def, pos_skill_flag_st)) %>%
  select(-pos_skill_flag_qb, -pos_skill_flag_wr_rb, -pos_skill_flag_def,
         -pos_skill_flag_st)

Create competition level flag

Account for players playing at schools with different talent pools, say Ohio State vs Villanova

d3_conf <- teams %>% 
  filter(classification == "iii") %>% 
  distinct(conference) %>%
  pull(conference)

d2_conf <- teams %>% 
  filter(classification == "ii") %>% 
  distinct(conference) %>% 
  pull(conference)

fcs_conf <- teams %>%
  filter(classification == "fcs") %>% 
  distinct(conference) %>% 
  pull(conference)

fbs_conf <- teams %>%
  filter(classification == "fbs") %>% 
  distinct(conference) %>% 
  pull(conference)

senior_stats <- senior_stats %>% 
  mutate(div_level = case_when(
    conference %in% fbs_conf ~ "D1 FBS",
    conference %in% fcs_conf ~ "D1 FCS",
    conference %in% d2_conf ~ "D2",
    conference %in% d3_conf ~ "D3",
    TRUE ~ NA),
  comp_level = case_when(
    conference %in% c("Pac-12", "Big 12", "Big Ten", "ACC", "SEC") ~ "P5",
    team == "Notre Dame" ~ "P5",
    conference %in% c("American Athletic", "Conference USA", "Mid-American",
                      "Mountain West", "Sun Belt") ~ "G5",
    team %in% c("Army", "BYU", "Massachusetts", "Liberty", "New Mexico State",
                "UConn", "Navy") ~ "G5",
    conference %in% c("MVFC", "CAA", "Big Sky") ~ "FCS Power",
    conference %in% c("UAC", "SWAC", "Ivy", "Patriot", "Pioneer", "NEC",
                      "Big South-OVC", "Southern", "MEAC", "Southland",
                      "FCS Independents", "Big South", "OVC", "AWC",
                      "Western Athletic", "Atlantic Sun") ~ "FCS",
    div_level == "D2" ~ "D2",
    div_level == "D3" ~ "D3",
    TRUE ~ NA))

Recheck missing in cfb df

analytic_v1 <- senior_stats %>% 
  select(year, team, conference, athlete_id, player, position, num_schools,
         side_of_ball, position_group, div_level, comp_level, pos_skill_flag)

missing_flag <- analytic_v1 %>% 
  filter(is.na(pos_skill_flag)) ## all OL so will drop

analytic_v1 <- analytic_v1 %>%
  filter(position_group != "OL")

gg_miss_var(analytic_v1)

Save college analytical file

write_csv(analytic_v1, "data processing/cfb_analytic_v1.csv")

Now NFL

Filter rosters to just Active Rookies during the regular season & relevant info

rookies <- rosters %>% 
  filter(season == rookie_year, game_type == "REG", status == "ACT",
         !(position %in% c("LS", "T", "G", "C", "OL"))) %>% 
  select(season, team, position, status, full_name, first_name, last_name, 
         birth_date, height, weight, gsis_id, espn_id, week, draft_club, 
         draft_number)

Check for missing IDs

missing_ids <- rookies %>% 
  filter(is.na(espn_id)) %>% 
  distinct(full_name, season, team)

## some NA IDs we have college info for, some we don't

Calculate num weeks active & basic success metric

weeks_active <- rookies %>% 
  group_by(gsis_id, espn_id, team) %>% 
  summarize(weeks_active_team = n(), .groups = "drop") %>%
  group_by(gsis_id, espn_id) %>% 
  arrange(desc(weeks_active_team), .by_group = TRUE) %>% 
  mutate(weeks_active = sum(weeks_active_team),
         num_teams = n_distinct(team),
         primary_team = first(team),
         multiple_teams = ifelse(num_teams > 1, 1, 0)) %>% 
  slice_head() %>% 
  ungroup() %>% 
  select(gsis_id, espn_id, primary_team, weeks_active, multiple_teams)

rookie_success_v1 <- rookies %>% 
  distinct(gsis_id) %>%
  left_join(weeks_active, by = "gsis_id") %>% 
  mutate(success_v1 = ifelse(weeks_active > 8, 1, 0))

rookie_success_v1 <- rookie_success_v1 %>% 
  left_join(rookies %>% select(gsis_id, season, full_name, position), 
            by = "gsis_id") %>%
  mutate(name_id = tolower(full_name)) %>% 
  distinct(gsis_id, weeks_active, .keep_all = TRUE) %>% 
  rename(primary_roster_team = primary_team)

missing_ids <- rookie_success_v1 %>% 
  filter(is.na(espn_id)) 

Merge draft, combine, and success dfs

draft_combine <- draft_combine %>% 
  select(season, draft_year, draft_team, draft_round, draft_ovr, pfr_id, 
         player_name, ht, wt, forty, bench, vertical, broad_jump, cone, 
         shuttle)

ids <- draft_nfl %>% 
  select(gsis_id, pfr_player_id, round, pick, age, dr_av, team)

full_draft <- ids %>% 
  mutate(gsis_id = ifelse(pfr_player_id == "YounBy01", "00-0039137", gsis_id))

draft_combine_rookie_success_v1 <- rookie_success_v1 %>% 
  left_join(full_draft, by = c("gsis_id")) %>%
  filter(!is.na(gsis_id)) 

Check for any duplicate gsis_ids

full_draft %>% 
  group_by(gsis_id) %>% 
  count(gsis_id) %>%
  filter(n > 1)
## # A tibble: 1 × 2
## # Groups:   gsis_id [1]
##   gsis_id     n
##   <chr>   <int>
## 1 <NA>      573
draft_combine_rookie_success_v1 %>% 
  group_by(gsis_id) %>% 
  count(gsis_id) %>% 
  filter(n > 1)
## # A tibble: 0 × 2
## # Groups:   gsis_id [0]
## # ℹ 2 variables: gsis_id <chr>, n <int>

Rename relevant draft columns

draft_combine_rookie_success_v1 <- draft_combine_rookie_success_v1 %>% 
  rename(draft_round = round,
         draft_pick = pick,
         draft_team = team)

Fix teams for consistency

draft_combine_rookie_success_v1 <- draft_combine_rookie_success_v1 %>%
  mutate(
    primary_roster_team = case_when(
      primary_roster_team == "OAK" ~ "LV",
      primary_roster_team == "SD" ~ "LAC",
      primary_roster_team == "BLT" ~ "BAL",
      primary_roster_team == "SL" ~ "LA",
      primary_roster_team == "HST" ~ "HOU",
      primary_roster_team == "CLV" ~ "CLE",
      primary_roster_team == "ARZ" ~ "ARI",
      TRUE ~ primary_roster_team))

save nfl analytical file

write_csv(draft_combine_rookie_success_v1, 
          "data processing/nfl_analytic_v1.csv")

Remove irrelevant dfs & rename current

college <- analytic_v1
nfl <- draft_combine_rookie_success_v1

rm(list = setdiff(ls(), c("college", "nfl")))

NFL Team Data

Read team schedule data in

team_schedules <- read_csv("raw data/load_schedules.csv")

Filter to regular season, Create home/away results

home_results <- team_schedules %>%
  filter(game_type == "REG") %>%
  mutate(
    result = case_when(
      home_score > away_score ~ "W",
      home_score < away_score ~ "L",
      home_score == away_score ~ "T"
    ),
    team = home_team,
    opponent = away_team
  ) %>%
  select(season, week, team, opponent, result, home_score, away_score)

away_results <- team_schedules %>%
  filter(game_type == "REG") %>% 
  mutate(
    result = case_when(
      away_score > home_score ~ "W",
      away_score < home_score ~ "L",
      away_score == home_score ~ "T"
    ),
    team = away_team,
    opponent = home_team
  ) %>%
  select(season, week, team, opponent, result, away_score, home_score)

count(home_results, result)
## # A tibble: 3 × 2
##   result     n
##   <chr>  <int>
## 1 L       1295
## 2 T         10
## 3 W       1574
count(away_results, result)
## # A tibble: 3 × 2
##   result     n
##   <chr>  <int>
## 1 L       1574
## 2 T         10
## 3 W       1295

Calculate each team’s number of wins & losses per season (by home/away team)

home_team_records <- home_results %>%
  group_by(season, team) %>%
  summarise(
    wins = sum(result == "W"),
    losses = sum(result == "L"),
    ties = sum(result == "T")
  ) %>%
  ungroup()
## `summarise()` has grouped output by 'season'. You can override using the
## `.groups` argument.
away_team_records <- away_results %>%
  group_by(season, team) %>%
  summarise(
    wins = sum(result == "W"),
    losses = sum(result == "L"),
    ties = sum(result == "T")
  ) %>%
  ungroup()
## `summarise()` has grouped output by 'season'. You can override using the
## `.groups` argument.

Combine home and away into full season for each team

team_records <- home_team_records %>%
  full_join(away_team_records, by = c("season", "team"), 
            suffix = c("_home", "_away")) %>%
  mutate(
    wins = coalesce(wins_home, 0) + coalesce(wins_away, 0),
    losses = coalesce(losses_home, 0) + coalesce(losses_away, 0),
    ties = coalesce(ties_home, 0) + coalesce(ties_away, 0)
  ) %>%
  select(season, team, wins, losses, ties)

Verify the number of games played each season

team_records %>% 
  mutate(total_games = wins + losses + ties) %>%
  filter(total_games != 16 & season %in% c(2015:2020))
## # A tibble: 0 × 6
## # ℹ 6 variables: season <dbl>, team <chr>, wins <dbl>, losses <dbl>,
## #   ties <dbl>, total_games <dbl>
team_records %>% 
  mutate(total_games = wins + losses + ties) %>% 
  filter(total_games != 17 & season %in% c(2021:2024)) 
## # A tibble: 2 × 6
##   season team   wins losses  ties total_games
##    <dbl> <chr> <dbl>  <dbl> <dbl>       <dbl>
## 1   2022 BUF      13      3     0          16
## 2   2022 CIN      12      4     0          16

BUF @ CIN game was cancelled in 2022 so 16 is valid for them

Calculate win percentage & save

team_wp <- team_records %>% 
  mutate(win_percentage = wins / (wins + losses + ties), 
         team = case_when(
           team == "OAK" ~ "LV",
           team == "SD" ~ "LAC",
           team == "STL" ~ "LA", 
           TRUE ~ team
         ))

write_csv(team_wp, "data processing/nfl_team_win_percentage.csv")

Start basic analysis

explore data

head(college) %>% print(width = Inf)
## # A tibble: 6 × 12
##    year team          conference    athlete_id player           position
##   <dbl> <chr>         <chr>              <dbl> <chr>            <chr>   
## 1  2014 Air Force     Mountain West     109907 Jon Lee          RB      
## 2  2014 Akron         Mid-American      147386 Josh Smith       WR      
## 3  2014 Norfolk State MEAC             2026567 Damian Smith     WR      
## 4  2014 Nebraska      Big Ten          3116088 Byerson Cockrell S       
## 5  2014 West Virginia Big 12           3116459 Dravon Henry     S       
## 6  2014 Louisville    ACC              3116652 Chris Miele      WR      
##   num_schools side_of_ball position_group div_level comp_level pos_skill_flag
##         <int> <chr>        <chr>          <chr>     <chr>      <chr>         
## 1           1 Off          RB             D1 FBS    G5         below         
## 2           1 Off          WR             D1 FBS    G5         at            
## 3           1 Off          WR             D1 FCS    FCS        at            
## 4           1 Def          DB             D1 FBS    P5         at            
## 5           1 Def          DB             D1 FBS    P5         at            
## 6           1 Off          WR             D1 FBS    P5         at
str(college)
## tibble [47,428 × 12] (S3: tbl_df/tbl/data.frame)
##  $ year          : num [1:47428] 2014 2014 2014 2014 2014 ...
##  $ team          : chr [1:47428] "Air Force" "Akron" "Norfolk State" "Nebraska" ...
##  $ conference    : chr [1:47428] "Mountain West" "Mid-American" "MEAC" "Big Ten" ...
##  $ athlete_id    : num [1:47428] 109907 147386 2026567 3116088 3116459 ...
##  $ player        : chr [1:47428] "Jon Lee" "Josh Smith" "Damian Smith" "Byerson Cockrell" ...
##  $ position      : chr [1:47428] "RB" "WR" "WR" "S" ...
##  $ num_schools   : int [1:47428] 1 1 1 1 1 1 1 1 1 1 ...
##  $ side_of_ball  : chr [1:47428] "Off" "Off" "Off" "Def" ...
##  $ position_group: chr [1:47428] "RB" "WR" "WR" "DB" ...
##  $ div_level     : chr [1:47428] "D1 FBS" "D1 FBS" "D1 FCS" "D1 FBS" ...
##  $ comp_level    : chr [1:47428] "G5" "G5" "FCS" "P5" ...
##  $ pos_skill_flag: chr [1:47428] "below" "at" "at" "at" ...
skim(college)
Data summary
Name college
Number of rows 47428
Number of columns 12
_______________________
Column type frequency:
character 9
numeric 3
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
team 0 1 3 26 0 306 0
conference 0 1 3 27 0 44 0
player 0 1 5 31 0 45287 0
position 0 1 1 4 0 19 0
side_of_ball 0 1 2 3 0 3 0
position_group 0 1 2 2 0 8 0
div_level 0 1 2 6 0 4 0
comp_level 0 1 2 9 0 6 0
pos_skill_flag 0 1 2 5 0 3 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
year 0 1 2020.51 3.23 2014 2018 2021 2024 2024 ▃▃▂▃▇
athlete_id 0 1 3895089.96 1433266.37 13223 3917911 4367020 4713129 5240138 ▂▁▁▂▇
num_schools 0 1 1.17 0.41 1 1 1 1 4 ▇▁▁▁▁
head(nfl) %>% print(width = Inf)
## # A tibble: 6 × 16
##   gsis_id    espn_id primary_roster_team weeks_active multiple_teams success_v1
##   <chr>        <dbl> <chr>                      <int>          <dbl>      <dbl>
## 1 00-0031474      NA NO                            17              0          1
## 2 00-0031480      NA NYG                           17              0          1
## 3 00-0031491 2265764 HOU                           14              0          1
## 4 00-0031492 2473037 JAX                           17              0          1
## 5 00-0031493 3423412 SF                             9              0          1
## 6 00-0031502 2472364 PIT                           17              0          1
##   season full_name     position name_id       pfr_player_id draft_round
##    <dbl> <chr>         <chr>    <chr>         <chr>               <dbl>
## 1   2015 Delvin Breaux CB       delvin breaux <NA>                   NA
## 2   2015 Uani 'Unga    MLB      uani 'unga    <NA>                   NA
## 3   2015 Brian Peters  LB       brian peters  <NA>                   NA
## 4   2015 Jason Myers   K        jason myers   <NA>                   NA
## 5   2015 Jarryd Hayne  RB       jarryd hayne  <NA>                   NA
## 6   2015 Jordan Berry  P        jordan berry  <NA>                   NA
##   draft_pick   age dr_av draft_team
##        <dbl> <dbl> <dbl> <chr>     
## 1         NA    NA    NA <NA>      
## 2         NA    NA    NA <NA>      
## 3         NA    NA    NA <NA>      
## 4         NA    NA    NA <NA>      
## 5         NA    NA    NA <NA>      
## 6         NA    NA    NA <NA>
str(nfl)
## tibble [3,394 × 16] (S3: tbl_df/tbl/data.frame)
##  $ gsis_id            : chr [1:3394] "00-0031474" "00-0031480" "00-0031491" "00-0031492" ...
##  $ espn_id            : num [1:3394] NA NA 2265764 2473037 3423412 ...
##  $ primary_roster_team: chr [1:3394] "NO" "NYG" "HOU" "JAX" ...
##  $ weeks_active       : int [1:3394] 17 17 14 17 9 17 17 7 17 2 ...
##  $ multiple_teams     : num [1:3394] 0 0 0 0 0 0 0 0 0 0 ...
##  $ success_v1         : num [1:3394] 1 1 1 1 1 1 1 0 1 0 ...
##  $ season             : num [1:3394] 2015 2015 2015 2015 2015 ...
##  $ full_name          : chr [1:3394] "Delvin Breaux" "Uani 'Unga" "Brian Peters" "Jason Myers" ...
##  $ position           : chr [1:3394] "CB" "MLB" "LB" "K" ...
##  $ name_id            : chr [1:3394] "delvin breaux" "uani 'unga" "brian peters" "jason myers" ...
##  $ pfr_player_id      : chr [1:3394] NA NA NA NA ...
##  $ draft_round        : num [1:3394] NA NA NA NA NA NA 1 NA NA NA ...
##  $ draft_pick         : num [1:3394] NA NA NA NA NA NA 1 NA NA NA ...
##  $ age                : num [1:3394] NA NA NA NA NA NA 21 NA NA NA ...
##  $ dr_av              : num [1:3394] NA NA NA NA NA NA 54 NA NA NA ...
##  $ draft_team         : chr [1:3394] NA NA NA NA ...
skim(nfl)
Data summary
Name nfl
Number of rows 3394
Number of columns 16
_______________________
Column type frequency:
character 7
numeric 9
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
gsis_id 0 1.00 10 10 0 3394 0
primary_roster_team 0 1.00 2 3 0 32 0
full_name 0 1.00 6 24 0 3380 0
position 0 1.00 1 3 0 20 0
name_id 0 1.00 6 24 0 3380 0
pfr_player_id 1381 0.59 8 8 0 2013 0
draft_team 1381 0.59 3 3 0 35 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
espn_id 923 0.73 3674771.05 705438.72 2265764 3047504 3915511 4360252 5278092 ▂▇▅▆▂
weeks_active 0 1.00 10.63 5.75 1 5 13 16 17 ▃▂▂▂▇
multiple_teams 0 1.00 0.02 0.15 0 0 0 0 1 ▇▁▁▁▁
success_v1 0 1.00 0.64 0.48 0 0 1 1 1 ▅▁▁▁▇
season 0 1.00 2019.86 3.12 2015 2017 2020 2023 2025 ▇▅▅▅▅
draft_round 1381 0.59 3.93 1.91 1 2 4 6 7 ▇▅▅▅▇
draft_pick 1381 0.59 120.80 71.77 1 59 118 179 262 ▇▇▇▇▅
age 1381 0.59 22.32 0.96 20 22 22 23 27 ▃▇▇▁▁
dr_av 1460 0.57 9.45 13.15 -1 1 5 13 106 ▇▁▁▁▁

Create drafted flag

nfl <- nfl %>% 
  mutate(drafted = ifelse(is.na(draft_round), 0, 1))

Univariate analysis

Counts for college data

college %>% count(year) %>% arrange(desc(n))
## # A tibble: 11 × 2
##     year     n
##    <dbl> <int>
##  1  2024 13562
##  2  2023  4926
##  3  2022  4890
##  4  2019  4094
##  5  2018  3986
##  6  2017  3862
##  7  2016  3419
##  8  2021  3419
##  9  2014  1952
## 10  2015  1770
## 11  2020  1548
college %>% count(team) %>% arrange(desc(n))
## # A tibble: 306 × 2
##    team               n
##    <chr>          <int>
##  1 Air Force        303
##  2 Army             285
##  3 Akron            281
##  4 BYU              278
##  5 Texas State      274
##  6 UTEP             270
##  7 UTSA             267
##  8 Navy             266
##  9 Louisiana Tech   263
## 10 Southern Miss    261
## # ℹ 296 more rows
college %>% count(conference) %>% arrange(desc(n))
## # A tibble: 44 × 2
##    conference            n
##    <chr>             <int>
##  1 Big Ten            3291
##  2 SEC                3221
##  3 ACC                3130
##  4 Conference USA     3053
##  5 American Athletic  3006
##  6 Sun Belt           2968
##  7 Mountain West      2893
##  8 Mid-American       2821
##  9 Big 12             2692
## 10 Pac-12             2001
## # ℹ 34 more rows
college %>% count(position) %>% arrange(desc(n))
## # A tibble: 19 × 2
##    position     n
##    <chr>    <int>
##  1 WR        8348
##  2 LB        6622
##  3 DB        5652
##  4 RB        5122
##  5 DL        4833
##  6 TE        2900
##  7 QB        2833
##  8 S         2472
##  9 CB        2093
## 10 PK        1695
## 11 DE        1592
## 12 DT        1090
## 13 P          969
## 14 LS         567
## 15 FB         302
## 16 OLB        106
## 17 ILB         91
## 18 NT          74
## 19 EDGE        67
college %>% count(position_group) %>% arrange(desc(n))
## # A tibble: 8 × 2
##   position_group     n
##   <chr>          <int>
## 1 DB             10217
## 2 WR              8348
## 3 DL              7656
## 4 LB              6819
## 5 RB              5424
## 6 ST              3231
## 7 TE              2900
## 8 QB              2833
college %>% count(num_schools) %>% arrange(desc(n))
## # A tibble: 4 × 2
##   num_schools     n
##         <int> <int>
## 1           1 40208
## 2           2  6556
## 3           3   643
## 4           4    21
college %>% count(side_of_ball) %>% arrange(desc(n))
## # A tibble: 3 × 2
##   side_of_ball     n
##   <chr>        <int>
## 1 Def          24692
## 2 Off          19505
## 3 ST            3231
college %>% count(div_level) %>% arrange(desc(n))
## # A tibble: 4 × 2
##   div_level     n
##   <chr>     <int>
## 1 D1 FBS    30256
## 2 D1 FCS    17058
## 3 D2          112
## 4 D3            2
college %>% count(comp_level) %>% arrange(desc(n))
## # A tibble: 6 × 2
##   comp_level     n
##   <chr>      <int>
## 1 G5         15732
## 2 P5         14550
## 3 FCS        11707
## 4 FCS Power   5325
## 5 D2           112
## 6 D3             2
college %>% count(pos_skill_flag) %>% arrange(desc(n))
## # A tibble: 3 × 2
##   pos_skill_flag     n
##   <chr>          <int>
## 1 below          20888
## 2 at             20226
## 3 above           6314

Counts for nfl data

nfl %>% count(success_v1) %>% arrange(desc(n))
## # A tibble: 2 × 2
##   success_v1     n
##        <dbl> <int>
## 1          1  2165
## 2          0  1229
nfl %>% count(primary_roster_team) %>% arrange(desc(n))
## # A tibble: 32 × 2
##    primary_roster_team     n
##    <chr>               <int>
##  1 CLE                   132
##  2 GB                    127
##  3 LA                    125
##  4 LAC                   121
##  5 IND                   120
##  6 LV                    118
##  7 NYJ                   117
##  8 JAX                   115
##  9 NYG                   114
## 10 WAS                   112
## # ℹ 22 more rows
nfl %>% count(weeks_active) %>% arrange(desc(n)) 
## # A tibble: 17 × 2
##    weeks_active     n
##           <int> <int>
##  1           16   801
##  2           17   396
##  3            1   316
##  4           14   232
##  5            2   188
##  6            3   150
##  7           15   149
##  8           13   142
##  9            4   132
## 10            7   122
## 11            5   117
## 12           12   114
## 13            9   112
## 14           10   112
## 15           11   107
## 16            6   102
## 17            8   102
nfl %>% count(multiple_teams) %>% arrange(desc(n))
## # A tibble: 2 × 2
##   multiple_teams     n
##            <dbl> <int>
## 1              0  3313
## 2              1    81
nfl %>% count(season) %>% arrange(desc(n))
## # A tibble: 11 × 2
##    season     n
##     <dbl> <int>
##  1   2016   423
##  2   2018   328
##  3   2022   328
##  4   2017   324
##  5   2020   319
##  6   2019   299
##  7   2023   297
##  8   2024   283
##  9   2025   277
## 10   2021   276
## 11   2015   240
nfl %>% count(position) %>% arrange(desc(n))
## # A tibble: 20 × 2
##    position     n
##    <chr>    <int>
##  1 DB         798
##  2 LB         576
##  3 WR         548
##  4 DL         505
##  5 RB         374
##  6 TE         238
##  7 QB         142
##  8 K           52
##  9 P           43
## 10 CB          32
## 11 DE          18
## 12 DT          14
## 13 OLB         14
## 14 ILB          9
## 15 NT           8
## 16 MLB          7
## 17 S            5
## 18 FS           4
## 19 SS           4
## 20 FB           3
nfl %>% count(draft_round) %>% arrange(desc(n))
## # A tibble: 8 × 2
##   draft_round     n
##         <dbl> <int>
## 1          NA  1381
## 2           3   323
## 3           4   315
## 4           5   313
## 5           6   286
## 6           2   278
## 7           1   272
## 8           7   226
nfl %>% count(draft_pick) %>% arrange(desc(n))
## # A tibble: 262 × 2
##    draft_pick     n
##         <dbl> <int>
##  1         NA  1381
##  2          1    11
##  3          2    11
##  4         30    11
##  5         40    11
##  6         47    11
##  7         52    11
##  8         60    11
##  9        113    11
## 10        165    11
## # ℹ 252 more rows
nfl %>% count(draft_team) %>% arrange(desc(n))
## # A tibble: 36 × 2
##    draft_team     n
##    <chr>      <int>
##  1 <NA>        1381
##  2 GNB           76
##  3 CLE           72
##  4 JAX           71
##  5 LAR           71
##  6 BAL           70
##  7 SFO           69
##  8 DAL           67
##  9 DEN           67
## 10 WAS           67
## # ℹ 26 more rows
nfl %>% count(drafted) %>% arrange(desc(n))
## # A tibble: 2 × 2
##   drafted     n
##     <dbl> <int>
## 1       1  2013
## 2       0  1381
nfl %>% count(age) %>% arrange(desc(n))
## # A tibble: 9 × 2
##     age     n
##   <dbl> <int>
## 1    NA  1381
## 2    22   760
## 3    23   650
## 4    21   397
## 5    24   167
## 6    25    24
## 7    20    11
## 8    26     3
## 9    27     1
nfl %>% count(dr_av) %>% arrange(desc(n))
## # A tibble: 76 × 2
##    dr_av     n
##    <dbl> <int>
##  1    NA  1460
##  2     0   384
##  3     1   207
##  4     2   167
##  5     3   102
##  6     5    99
##  7     4    91
##  8     6    79
##  9     7    58
## 10     8    52
## # ℹ 66 more rows

Check missing

gg_miss_var(college) +
  labs(title = "Missing values in college data")

gg_miss_var(nfl) +
  labs(title = "Missing values in nfl data")

Missing data in NFL makes sense if they were undrafted. Will need to figure out way to handle missing espn_id because that’s a lot

Join College & NFL Data

Add name_id

Make sure all OL are filtered out of both dfs

college <- college %>%
  filter(!(position %in% c("LS"))) %>% 
  mutate(name_id_college = player %>% 
           str_to_lower() %>% 
           str_replace_all("[^a-z]", ""),
         name_season_id_college = paste0(name_id_college, "-", year),
         first_name = tolower(str_extract(player, "^[^ ]+")),
         last_name = tolower(str_extract(player, "[^ ]+$")),
         athlete_id = as.character(athlete_id)) %>% 
  rename(position_college = position)

nfl <- nfl %>% 
  filter(!(position %in% c("G", "T", "C", "OL", "LS"))) %>% 
  mutate(college_year = season - 1,
         name_id = full_name %>% 
           str_to_lower() %>% 
           str_replace_all("[^a-z]", ""),
         name_season_id = paste0(name_id, "-", college_year),
         espn_id = as.character(espn_id))

check name_id counts & duplicates

college_dups <- college %>%
  count(name_id_college) %>%
  filter(n > 1) %>% 
  left_join(college, by = "name_id_college")

nfl_dups <- nfl %>% 
  count(name_id) %>% 
  filter(n > 1) %>% 
  left_join(nfl, by = "name_id")

Start with join on non-missing espn_id

merged_df_id <- nfl %>%
  filter(!is.na(espn_id)) %>%
  left_join(college %>% filter(!is.na(athlete_id)),
            by = c("espn_id" = "athlete_id")) %>% 
  filter(!is.na(position_college)) ## espn_id only works for 1294 players

## check possible incorrect matches
merged_df_id %>%
  filter(name_id != name_id_college) %>%
  select(season, name_id, name_id_college, draft_team, team) %>% 
  arrange(season)
## # A tibble: 119 × 5
##    season name_id          name_id_college    draft_team team            
##     <dbl> <chr>            <chr>              <chr>      <chr>           
##  1   2016 mikethomas       michaelthomas      LAR        Southern Miss   
##  2   2018 simmiecobbs      simmiecobbsjr      <NA>       Indiana         
##  3   2018 mikebadgley      michaelbadgley     <NA>       Miami           
##  4   2018 jeffwilson       jefferywilson      <NA>       North Texas     
##  5   2018 jakemartin       jacobmartin        SEA        Temple          
##  6   2018 olaadeniyi       olasunkanmiadeniyi <NA>       Toledo          
##  7   2018 dorancearmstrong dorancearmstrongjr DAL        Kansas          
##  8   2018 kammoore         kamrinmoore        NOR        Boston College  
##  9   2018 buddyhowell      gregoryhowelljr    <NA>       Florida Atlantic
## 10   2018 zeketurner       ezekielturner      <NA>       Washington      
## # ℹ 109 more rows
## all same player, just different version of name

Create join/match reasons

Hierarchical matching process

nfl_dups_name_season <- nfl_dups %>% 
  count(name_season_id) %>% 
  filter(n > 1) %>% 
  left_join(nfl_dups %>% select(name_id, gsis_id, espn_id, 
                                draft_team, name_season_id), 
            by = "name_season_id") 

college_dups_name_season <- college_dups %>% 
  count(name_season_id_college) %>% 
  filter(n > 1) %>% 
  left_join(college_dups %>% select(name_id_college, athlete_id, 
                                     team, name_season_id_college), 
            by = "name_season_id_college")

merged_df_id <- nfl %>%
  filter(!is.na(espn_id)) %>%
  left_join(college %>% filter(!is.na(athlete_id)),
            by = c("espn_id" = "athlete_id")) %>% 
  filter(!is.na(position_college)) %>%
  mutate(match_type = "1 - ID match")

merged_df_name <- nfl %>%
  filter(!(name_id %in% nfl_dups$name_id), 
         !(gsis_id %in% merged_df_id$gsis_id)) %>% 
  left_join(college %>% filter(!(name_id_college %in% college_dups$name_id_college)), 
            by = c("name_id" = "name_id_college")) %>% 
  filter(!is.na(position_college)) %>% 
  mutate(match_type = "2 - Name match")

merged_df_name_season <- nfl %>% 
  filter(!(gsis_id %in% merged_df_id$gsis_id),
         !(gsis_id %in% merged_df_name$gsis_id),
         !(name_season_id %in% college_dups_name_season$name_season_id_college),
         !(name_season_id %in% nfl_dups_name_season$name_season_id)) %>%
  left_join(college, by = c("name_season_id" = "name_season_id_college")) %>% 
  filter(!is.na(position_college)) %>% 
  mutate(match_type = "3 - Name-Season match") 

check_matches <- nfl %>% 
  mutate(match_type = case_when(
    gsis_id %in% merged_df_id$gsis_id ~ "1 - ID match",
    gsis_id %in% merged_df_name$gsis_id ~ "2 - Name match",
    gsis_id %in% merged_df_name_season$gsis_id ~ "3 - Name-Season match",
    TRUE ~ "0 - Unmatched"
  )) 

count(check_matches, match_type) 
## # A tibble: 4 × 2
##   match_type                n
##   <chr>                 <int>
## 1 0 - Unmatched           477
## 2 1 - ID match           1554
## 3 2 - Name match         1263
## 4 3 - Name-Season match   100

85.95% of data is matched - call that a success (for now).

Check unmatched vs success

check_matches %>% 
  select(match_type, success_v1) %>% 
  table() %>% 
  prop.table(margin = 1)
##                        success_v1
## match_type                      0         1
##   0 - Unmatched         0.4109015 0.5890985
##   1 - ID match          0.3063063 0.6936937
##   2 - Name match        0.4117181 0.5882819
##   3 - Name-Season match 0.3700000 0.6300000
check_matches %>% select(success_v1) %>% table() %>% prop.table()
## success_v1
##         0         1 
## 0.3621096 0.6378904

Combine all joined dfs

extra_cols <- Reduce(union, list(
  setdiff(names(merged_df_id), names(merged_df_name)),
  setdiff(names(merged_df_id), names(merged_df_name_season)),
  setdiff(names(merged_df_name), names(merged_df_id)),
  setdiff(names(merged_df_name), names(merged_df_name_season)),
  setdiff(names(merged_df_name_season), names(merged_df_id)),
  setdiff(names(merged_df_name_season), names(merged_df_name))
))

merged_df_id <- merged_df_id %>% 
  select(-any_of(extra_cols))
merged_df_name <- merged_df_name %>% 
  select(-any_of(extra_cols))
merged_df_name_season <- merged_df_name_season %>% 
  select(-any_of(extra_cols))

final_merged_df <- bind_rows(merged_df_id,
                             merged_df_name,
                             merged_df_name_season)

write_csv(final_merged_df, "data processing/combined_analytic.csv")

Final check for duplicates

final_merged_df %>% 
  count(gsis_id) %>% 
  filter(n > 1) 
## # A tibble: 0 × 2
## # ℹ 2 variables: gsis_id <chr>, n <int>

No duplicates!

Clean environment

rm(list = ls())

Further Data Cleaning & Preparation for Model

Read relevant data in

combined <- read_csv("data processing/combined_analytic.csv")
team_wp <- read_csv("data processing/nfl_team_win_percentage.csv")

Drop unnecessary columns

combined <- combined %>% 
  select(-c("espn_id", "name_id", "pfr_player_id",
           "name_season_id", "year", "player", "first_name", "last_name", 
           "match_type"))

Check missing

gg_miss_var(combined) +
  labs(title = "Missing values in analytic file data")

gg_miss_var(team_wp) +
  labs(title = "Missing values in Team WP data")

Change draft_team to abbreviations to match primary_roster_team

combined <- combined %>% 
  mutate(draft_team = case_when(
    draft_team == "GNB" ~ "GB",
    draft_team == "DEN" ~ "DEN",
    draft_team == "CAR" ~ "CAR",
    draft_team == "BAL" ~ "BAL", 
    draft_team == "SFO" ~ "SF",
    draft_team == "WAS" ~ "WAS",
    draft_team == "HOU" ~ "HOU",
    draft_team == "ATL" ~ "ATL", 
    draft_team == "PHI" ~ "PHI",
    draft_team == "IND" ~ "IND",
    draft_team == "DET" ~ "DET",
    draft_team == "DAL" ~ "DAL",
    draft_team == "SEA" ~ "SEA",
    draft_team == "JAX" ~ "JAX",
    draft_team == "PIT" ~ "PIT",
    draft_team == "CLE" ~ "CLE",
    draft_team == "KAN" ~ "KC",
    draft_team == "NOR" ~ "NO",
    draft_team == "TEN" ~ "TEN",
    draft_team == "ARI" ~ "ARI",
    draft_team == "MIN" ~ "MIN",
    draft_team == "LAR" ~ "LA",
    draft_team == "TAM" ~ "TB",
    draft_team == "LVR" ~ "LV",
    draft_team == "LAC" ~ "LAC",
    draft_team == "NYG" ~ "NYG",
    draft_team == "CHI" ~ "CHI",
    draft_team == "BUF" ~ "BUF",
    draft_team == "CIN" ~ "CIN",
    draft_team == "NYJ" ~ "NYJ",
    draft_team == "MIA" ~ "MIA",
    draft_team == "NWE" ~ "NE",
    draft_team == "SDG" ~ "LAC",
    draft_team == "STL" ~ "LA",
    draft_team == "OAK" ~ "LV",
    TRUE ~ draft_team
  ))

Change NA to Undrafted

combined <- combined %>% 
  mutate(draft_team = ifelse(is.na(draft_team), "UDFA", draft_team),
         draft_round = ifelse(is.na(draft_round), "Undrafted", draft_round))

Create binary different position and changed team variables

combined <- combined %>% 
  mutate(different_position = ifelse(position != position_college, 1, 0),
         changed_teams = ifelse(primary_roster_team != draft_team, 1, 0))

Create draft round categories

combined <- combined %>% 
  mutate(draft_round_cat = case_when(
    draft_round %in% c(1) ~ "Round 1",
    draft_round %in% c(2,3) ~ "Rounds 2-3",
    draft_round %in% c(4,5,6,7) ~ "Rounds 4-7",
    TRUE ~ "Undrafted"
  ))

Add draft & roster team previous winning percentage as variable

combined <- combined %>% 
  mutate(prev_season = season - 1) %>% 
  left_join(team_wp %>% select(team, season, win_percentage),
            by = c("draft_team" = "team", "prev_season" = "season")) %>%
  rename(draft_team_wp_prev = win_percentage) %>%
  left_join(team_wp %>% select(team, season, win_percentage), 
            by = c("primary_roster_team" = "team", "prev_season" = "season")) %>% 
  rename(roster_team_wp_prev = win_percentage) %>% 
  select(-prev_season)

count(combined, is.na(roster_team_wp_prev)) 
## # A tibble: 1 × 2
##   `is.na(roster_team_wp_prev)`     n
##   <lgl>                        <int>
## 1 FALSE                         2917

All players have a primary roster team winning percentage so can use that without any concerns.

Save data for modelling process

write_csv(combined, "data processing/full_data.csv")

rm(team_wp)

Variable Analysis

Basic histograms

ggplot(combined, aes(x = num_schools)) +
  geom_histogram(binwidth = 1, fill = "skyblue", color = "black") +
  labs(title = "Distribution of Number of Schools Attended",
       x = "Number of Schools",
       y = "Count of Players") +
  theme_minimal()

ggplot(combined, aes(x = position_group)) +
  geom_bar(stat = "count", fill = "skyblue", color = "black") +
  labs(title = "Distribution of College Positions",
       x = "Position Group",
       y = "Count of Players") +
  theme_minimal()

ggplot(combined, aes(x = as.factor(success_v1))) +
  geom_bar(fill = "skyblue", color = "black") +
  labs(title = "Distribution of Basic Success for Rookies",
       x = "Active for more than half the season (1) or not (0)",
       y = "Count of Players") +
  theme_minimal()

Univariate Analysis for All Numeric Variables

numeric_vars <- combined %>%
  select(where(is.numeric)) %>%
  names()

for (var in numeric_vars) {
  p <- ggplot(combined, aes_string(x = var)) +
    geom_histogram(binwidth = 1, fill = "skyblue", color = "black") +
    labs(title = paste("Distribution of", var),
         x = var,
         y = "Count of Players") +
    theme_minimal()
  print(p)
}

Bivariate Analysis

Modify Data

combined_for_analysis <- combined %>% 
  filter(season != 2025) %>%
  select(-c("gsis_id",  "full_name")) %>% 
  mutate(primary_roster_team = as.factor(primary_roster_team),
         multiple_teams = as.factor(multiple_teams),
         success_v1 = as.factor(success_v1),
         num_schools = as.factor(num_schools),
         side_of_ball = as.factor(side_of_ball),
         position_group = as.factor(position_group),
         conference = as.factor(conference),
         div_level = as.factor(div_level) %>% relevel("D1 FBS"),
         comp_level = as.factor(comp_level) %>% relevel("P5"),
         pos_skill_flag = as.factor(pos_skill_flag),
         position = as.factor(position),
         position_college = as.factor(position_college),
         different_position = as.factor(different_position),
         drafted = as.factor(drafted),
         draft_team = as.factor(draft_team),
         changed_teams = as.factor(changed_teams),
         draft_round = as.factor(draft_round) %>% relevel("Undrafted"),
         draft_round_cat = as.factor(draft_round_cat) %>% relevel("Undrafted")
  ) 

Pull relevant variable names

bivariate_cols <- colnames(combined_for_analysis)

numeric_cols <- combined_for_analysis %>% 
  select(all_of(bivariate_cols)) %>% 
  select(where(is.numeric)) %>% 
  colnames()

cat_cols <- combined_for_analysis %>% 
  select(all_of(bivariate_cols)) %>% 
  select(!any_of(numeric_cols)) %>% 
  colnames()

Numeric Variables

combined_for_analysis %>% 
  group_by(season) %>% 
  summarize(success_rate = mean(success_v1 == 1), 
            n = n()) %>%
  ggplot(aes(x = season, y = success_rate)) +
  geom_point() +
  geom_smooth() +
  theme_classic()

combined_for_analysis %>% 
  group_by(college_year) %>% 
  summarize(success_rate = mean(success_v1 == 1), 
            n = n()) %>%
  ggplot(aes(x = college_year, y = success_rate)) +
  geom_point() +
  geom_smooth() +
  theme_classic()

combined_for_analysis %>% 
  group_by(draft_pick) %>% 
  summarize(success_rate = mean(success_v1 == 1), 
            n = n()) %>%
  ggplot(aes(x = draft_pick, y = success_rate)) +
  geom_point() +
  geom_smooth() +
  theme_classic()

combined_for_analysis %>% 
  group_by(age) %>% 
  summarize(success_rate = mean(success_v1 == 1), 
            n = n()) %>%
  ggplot(aes(x = age, y = success_rate)) +
  geom_point() +
  geom_smooth() +
  theme_classic()

combined_for_analysis %>% 
  group_by(dr_av) %>% 
  summarize(success_rate = mean(success_v1 == 1), 
            n = n()) %>%
  ggplot(aes(x = dr_av, y = success_rate)) +
  geom_point() +
  geom_smooth() +
  theme_classic()

combined_for_analysis %>% 
  group_by(draft_team_wp_prev) %>% 
  summarize(success_rate = mean(success_v1 == 1), 
            n = n()) %>%
  ggplot(aes(x = draft_team_wp_prev, y = success_rate)) +
  geom_point() +
  geom_smooth() +
  theme_classic()

combined_for_analysis %>% 
  group_by(roster_team_wp_prev) %>% 
  summarize(success_rate = mean(success_v1 == 1), 
            n = n()) %>%
  ggplot(aes(x = roster_team_wp_prev, y = success_rate)) +
  geom_point() +
  geom_smooth() +
  theme_classic()

*Note: didn’t look at weeks_active vs success_v1 because success_v1 is calculated from weeks_active.

Categorical vars

ggplot(combined_for_analysis, aes(x = primary_roster_team, fill = success_v1)) +
  geom_bar(position = "fill") +
  labs(y = "Proportion") +
  theme_classic() +
  guides(x = guide_axis(angle = 90))

ggplot(combined_for_analysis, aes(x = multiple_teams, fill = success_v1)) +
  geom_bar(position = "fill") +
  labs(y = "Proportion") +
  theme_classic() +
  guides(x = guide_axis(angle = 90))

ggplot(combined_for_analysis, aes(x = changed_teams, fill = success_v1)) +
  geom_bar(position = "fill") +
  labs(y = "Proportion") +
  theme_classic() +
  guides(x = guide_axis(angle = 90))

ggplot(combined_for_analysis, aes(x = different_position, fill = success_v1)) +
  geom_bar(position = "fill") +
  labs(y = "Proportion") +
  theme_classic() +
  guides(x = guide_axis(angle = 90))

ggplot(combined_for_analysis, aes(x = position, fill = success_v1)) +
  geom_bar(position = "fill") +
  labs(y = "Proportion") +
  theme_classic() +
  guides(x = guide_axis(angle = 90))

ggplot(combined_for_analysis, aes(x = position_college, fill = success_v1)) +
  geom_bar(position = "fill") +
  labs(y = "Proportion") +
  theme_classic() +
  guides(x = guide_axis(angle = 90))

ggplot(combined_for_analysis, aes(x = position_group, fill = success_v1)) +
  geom_bar(position = "fill") +
  labs(y = "Proportion") +
  theme_classic() +
  guides(x = guide_axis(angle = 90))

ggplot(combined_for_analysis, aes(x = pos_skill_flag, fill = success_v1)) +
  geom_bar(position = "fill") +
  labs(y = "Proportion") +
  theme_classic() +
  guides(x = guide_axis(angle = 90))

ggplot(combined_for_analysis, aes(x = draft_round, fill = success_v1)) +
  geom_bar(position = "fill") +
  labs(y = "Proportion") +
  theme_classic() +
  guides(x = guide_axis(angle = 90))

ggplot(combined_for_analysis, aes(x = draft_round_cat, fill = success_v1)) +
  geom_bar(position = "fill") +
  labs(y = "Proportion") +
  theme_classic() +
  guides(x = guide_axis(angle = 90))

ggplot(combined_for_analysis, aes(x = draft_team, fill = success_v1)) +
  geom_bar(position = "fill") +
  labs(y = "Proportion") +
  theme_classic() +
  guides(x = guide_axis(angle = 90))

ggplot(combined_for_analysis, aes(x = drafted, fill = success_v1)) +
  geom_bar(position = "fill") +
  labs(y = "Proportion") +
  theme_classic() +
  guides(x = guide_axis(angle = 90))

ggplot(combined_for_analysis, aes(x = team, fill = success_v1)) +
  geom_bar(position = "fill") +
  labs(y = "Proportion") +
  theme_classic() +
  guides(x = guide_axis(angle = 90))

ggplot(combined_for_analysis, aes(x = conference, fill = success_v1)) +
  geom_bar(position = "fill") +
  labs(y = "Proportion") +
  theme_classic() +
  guides(x = guide_axis(angle = 90))

ggplot(combined_for_analysis, aes(x = div_level, fill = success_v1)) +
  geom_bar(position = "fill") +
  labs(y = "Proportion") +
  theme_classic() +
  guides(x = guide_axis(angle = 90))

ggplot(combined_for_analysis, aes(x = comp_level, fill = success_v1)) +
  geom_bar(position = "fill") +
  labs(y = "Proportion") +
  theme_classic() +
  guides(x = guide_axis(angle = 90))

ggplot(combined_for_analysis, aes(x = num_schools, fill = success_v1)) +
  geom_bar(position = "fill") +
  labs(y = "Proportion") +
  theme_classic() +
  guides(x = guide_axis(angle = 90))

Improved Graphs for Analysis in Presentation

Distribution of Weeks Active

ggplot(combined_for_analysis, aes(x = weeks_active)) +
  geom_histogram(binwidth = 1, fill = "skyblue", color = "black") +
  geom_vline(xintercept = 8.5, 
             color = "darkblue", linetype = "dashed", linewidth = 1) +
  labs(title = "Distribution of Weeks Active",
       subtitle = "For NFL Rookies",
       x = "Weeks on Active Roster",
       y = "Count of Players") +
  theme(plot.subtitle = element_text(size = 5)) +
  theme_minimal()

Draft Status & Success Rate

prop_df <- combined_for_analysis %>%
  count(drafted, success_v1) %>%
  group_by(drafted) %>%
  mutate(prop = n / sum(n),
         prop_round = paste0(round(prop, 3) * 100, "%")) 

ggplot(prop_df, aes(x = factor(drafted), y = prop, fill = factor(success_v1))) +
  geom_col(position = "fill") +
  geom_text(aes(label = ifelse(success_v1 == 1, prop_round, "")),
    position = position_stack(vjust = 0.9),
    color = "black",
    size = 3.5
  ) +
  scale_fill_manual(
    name = "Successful?",
    values = c("#D3D3D3", "#87CEEB"),      
    labels = c("No", "Yes")
  ) +
  labs(
    x = "Drafted (1 = Yes)",
    y = "Proportion",
    title = "NFL Rookie Draft Status and Success Rate") +
  theme_minimal()

Changed Teams & Success Rate

ggplot(combined_for_analysis, aes(x = changed_teams, fill = success_v1)) +
  geom_bar(position = "fill") +
  scale_fill_manual(
    name = "Successful?",
    values = c("#D3D3D3", "#87CEEB"),      
    labels = c("No", "Yes")
  ) +
  labs(
    x = "Changed Teams (1 = Yes)",
    y = "Proportion",
    title = "NFL Rookies Switching Teams and Success Rate",
    subtitle = "Have a different primary roster team from their draft team") +
  theme(plot.subtitle = element_text(size = 5)) +
  theme_minimal()

Age vs Success Rate

combined_for_analysis %>% 
  group_by(age) %>% 
  summarize(success_rate = mean(success_v1 == 1), 
            n = n()) %>%
  ggplot(aes(x = age, y = success_rate)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(x = "Age (at draft)",
       y = "Success Rate",
       title = "Player Age vs Average Success Rate") +
  theme(plot.subtitle = element_text(size = 5)) +
  theme_classic()

combined_for_analysis %>% 
  group_by(age) %>% 
  filter(age < 26) %>%
  summarize(success_rate = mean(success_v1 == 1), 
            n = n()) %>%
  ggplot(aes(x = age, y = success_rate)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(x = "Age (at draft)",
       y = "Success Rate",
       title = "Player Age vs Average Success Rate",
       subtitle = "After Dropping Rookies Older than 25") +
  theme(plot.subtitle = element_text(size = 5)) +
  theme_classic()

Clean Environment

rm(list = ls())

Model 1: Random Forest

Read data & prepare for model

data <- read_csv("data processing/full_data.csv")

model_data <- data %>% 
  filter(season != 2025) %>%
  select(-c("gsis_id", "weeks_active", "full_name", "position",
            "college_year", "season", "team", "conference", 
            "position_college", "draft_pick", "draft_team_wp_prev", 
            "draft_team")) %>% 
  mutate(primary_roster_team = as.factor(primary_roster_team),
         multiple_teams = as.factor(multiple_teams),
         success_v1 = as.factor(success_v1),
         num_schools = as.factor(num_schools),
         side_of_ball = as.factor(side_of_ball),
         position_group = as.factor(position_group),
         div_level = as.factor(div_level) %>% relevel("D1 FBS"),
         comp_level = as.factor(comp_level) %>% relevel("P5"),
         pos_skill_flag = as.factor(pos_skill_flag),
         different_position = as.factor(different_position),
         drafted = as.factor(drafted),
         changed_teams = as.factor(changed_teams),
         draft_round = as.factor(draft_round) %>% relevel("Undrafted"),
         draft_round_cat = as.factor(draft_round_cat) %>% relevel("Undrafted")
  )

Variable Imputation

model_data <- model_data %>% 
  mutate(dr_av_missing = as.factor(ifelse(is.na(dr_av), 1, 0)),
         age_missing = as.factor(ifelse(is.na(age), 1, 0)))

avg_dr_av_pos_round <- model_data %>% 
  group_by(position_group, draft_round) %>% 
  summarize(avg_dr_av = mean(dr_av, na.rm = TRUE), .groups = "drop") %>% 
  mutate(avg_dr_av = round(avg_dr_av))
avg_dr_av_pos <- model_data %>% 
  group_by(position_group) %>% 
  summarize(avg_dr_av = mean(dr_av, na.rm = TRUE), .groups = "drop") %>% 
  mutate(avg_dr_av = round(avg_dr_av))

avg_age <- model_data %>% 
  summarize(avg_age = mean(age, na.rm = TRUE), .groups = "drop") %>% 
  mutate(avg_age = round(avg_age)) %>% 
  pull()

model_data_imp <- model_data %>%
  left_join(avg_dr_av_pos_round, by = c("position_group", "draft_round"), 
            suffix = c("", "_pos_round")) %>%
  left_join(avg_dr_av_pos, by = "position_group",
            suffix = c("", "_pos"))

model_data_imp <- model_data_imp %>%
  mutate(dr_av_imputed = if_else(is.na(dr_av),
                                 coalesce(.data$avg_dr_av, .data$avg_dr_av_pos),
                                 dr_av)) %>% 
  select(-avg_dr_av, -avg_dr_av_pos)

model_data_imp <- model_data_imp %>% 
  mutate(dr_av = dr_av_imputed,
         age = ifelse(is.na(age), avg_age, age)) %>% 
  select(-dr_av_imputed) %>% 
  filter(age != 26, age!= 27)

save(avg_dr_av_pos, avg_dr_av_pos_round, avg_age, 
     file = "data processing/avg_imputation.Rdata")
rm(avg_dr_av_pos, avg_dr_av_pos_round, avg_age)

Get data directly ready for Random Forest

Drop factor levels with low sample size and remove multicollinear variables
rf_model_data <- model_data_imp %>%
  mutate(across(where(is.factor),
                ~ fct_lump_min(.x, min = 20))) %>%
  select(-c("age_missing", "changed_teams", "dr_av_missing", "drafted",
            "draft_round_cat", "position_group"))

gg_miss_var(rf_model_data) 

Split for train/test

set.seed(22)
split = sample.split(rf_model_data$success_v1, SplitRatio = 0.8) 

Base Random Forest

set.seed(22)
rf <- randomForest(success_v1 ~., data = rf_model_data[split,],
                   ntree = 501,
                   importance = TRUE)

print(rf)
## 
## Call:
##  randomForest(formula = success_v1 ~ ., data = rf_model_data[split,      ], ntree = 501, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 501
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 27.71%
## Confusion matrix:
##     0    1 class.error
## 0 374  367   0.4952767
## 1 218 1152   0.1591241
plot(rf)

rf_class <- predict(rf, rf_model_data[!split,], type = "response")
fscore <- confusionMatrix(rf_class, rf_model_data$success_v1[!split],
                          positive = "1")$byClass["F1"]

rf$importance
##                                 0             1 MeanDecreaseAccuracy
## primary_roster_team  0.0095610687 -0.0020081069         2.064763e-03
## multiple_teams      -0.0013777336  0.0007205110        -1.524129e-05
## draft_round          0.1567007927  0.0298231898         7.429152e-02
## age                  0.0268718131 -0.0070111733         4.833753e-03
## dr_av                0.0740806418 -0.0071368106         2.135590e-02
## num_schools          0.0035957443  0.0006025060         1.665904e-03
## side_of_ball         0.0084920100  0.0018649977         4.187462e-03
## div_level           -0.0017517056  0.0012444771         1.945249e-04
## comp_level           0.0006484189  0.0011648535         9.680349e-04
## pos_skill_flag       0.0020695524 -0.0016032128        -3.214563e-04
## different_position   0.0049387605 -0.0001451474         1.650934e-03
## roster_team_wp_prev  0.0128688407 -0.0071043682        -1.022610e-04
##                     MeanDecreaseGini
## primary_roster_team       221.632969
## multiple_teams              6.847433
## draft_round               185.247973
## age                        38.769534
## dr_av                     132.836073
## num_schools                16.940535
## side_of_ball               27.407014
## div_level                   6.344361
## comp_level                 28.415333
## pos_skill_flag             35.613748
## different_position         19.467025
## roster_team_wp_prev        94.884539
varImpPlot(rf)

OOB Error is 27.71% and F-score of 0.785.

Calculate best value for mtry parameter

m <- seq(2, 12)
fscore.seq <- numeric()

for(i in 1:length(m)){ 
  set.seed(22)
  rf <- randomForest(success_v1 ~., data = rf_model_data[split,],
                     ntree = 501,
                     importance = TRUE,
                     mtry = m[i]) 
  
  rf_class <- predict(rf, newdata = rf_model_data[!split,], type = "response")
  fscore.seq[i] <- confusionMatrix(rf_class, rf_model_data$success_v1[!split],
                                   positive = "1")$byClass["F1"]
}

plot(m, fscore.seq, pch = 19 , col="blue", type = "b",
     ylab = "F-score", xlab = "Number of Predictors considered at each split")

m_best <- m[which.max(fscore.seq)] 
rm(fscore.seq, i, m)

Calculate best value for ntree parameter

ntrees <- c(101, 251, 501, 751, 1001)
fscore.seq <- numeric()

for(i in seq_along(ntrees)){ 
  set.seed(22)
  rf <- randomForest(success_v1 ~., data = rf_model_data[split,],
                     ntree = ntrees[i],
                     importance = TRUE,
                     mtry = m_best) 
  
  rf_class <- predict(rf, newdata = rf_model_data[!split,], type = "response")
  fscore.seq[i] <- confusionMatrix(rf_class, rf_model_data$success_v1[!split],
                                   positive = "1")$byClass["F1"]
}

plot(ntrees, fscore.seq, pch = 19 , col="blue", type = "b",
     ylab = "F-score", xlab = "Number of Trees used in Random Forest")

ntrees_best <- ntrees[which.max(fscore.seq)] 
rm(ntrees, fscore.seq)

Create final random forest with optimal parameters

set.seed(22)
rf_final <- randomForest(success_v1 ~., data = rf_model_data[split,],
                         ntree = ntrees_best, mtry = m_best, importance = TRUE)

print(rf_final) 
## 
## Call:
##  randomForest(formula = success_v1 ~ ., data = rf_model_data[split,      ], ntree = ntrees_best, mtry = m_best, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 501
## No. of variables tried at each split: 4
## 
##         OOB estimate of  error rate: 27.66%
## Confusion matrix:
##     0    1 class.error
## 0 365  376   0.5074224
## 1 208 1162   0.1518248
plot(rf_final)

rf_final_class <- predict(rf_final, newdata = rf_model_data[!split,], 
                          type = "response")

confusionMatrix(rf_final_class, rf_model_data$success_v1[!split])
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0  97  57
##          1  88 286
##                                          
##                Accuracy : 0.7254         
##                  95% CI : (0.6852, 0.763)
##     No Information Rate : 0.6496         
##     P-Value [Acc > NIR] : 0.0001223      
##                                          
##                   Kappa : 0.3725         
##                                          
##  Mcnemar's Test P-Value : 0.0127254      
##                                          
##             Sensitivity : 0.5243         
##             Specificity : 0.8338         
##          Pos Pred Value : 0.6299         
##          Neg Pred Value : 0.7647         
##              Prevalence : 0.3504         
##          Detection Rate : 0.1837         
##    Detection Prevalence : 0.2917         
##       Balanced Accuracy : 0.6791         
##                                          
##        'Positive' Class : 0              
## 
fscore <- confusionMatrix(rf_final_class, rf_model_data$success_v1[!split],
                          positive = "1")$byClass["F1"]

rf_final$importance
##                                 0             1 MeanDecreaseAccuracy
## primary_roster_team  0.0119026592 -3.236297e-03         2.076146e-03
## multiple_teams      -0.0010087311  4.531214e-04        -5.454426e-05
## draft_round          0.1613960468  2.768474e-02         7.436701e-02
## age                  0.0231650582 -7.049128e-03         3.543891e-03
## dr_av                0.0797720558 -9.357569e-03         2.178815e-02
## num_schools          0.0045641444 -5.612973e-05         1.563106e-03
## side_of_ball         0.0083911465  2.494246e-03         4.557601e-03
## div_level           -0.0013153925  9.716571e-04         1.674719e-04
## comp_level           0.0030798867  6.172516e-04         1.492653e-03
## pos_skill_flag      -0.0001094845 -8.088085e-04        -5.821274e-04
## different_position   0.0050036070 -6.170856e-04         1.362034e-03
## roster_team_wp_prev  0.0143345011 -7.327268e-03         2.717310e-04
##                     MeanDecreaseGini
## primary_roster_team       241.841521
## multiple_teams              7.151242
## draft_round               196.427442
## age                        37.118685
## dr_av                     143.217668
## num_schools                18.059708
## side_of_ball               31.197995
## div_level                   7.035268
## comp_level                 32.236374
## pos_skill_flag             41.250450
## different_position         22.809933
## roster_team_wp_prev       112.637920
varImpPlot(rf_final)

OOB Error is 27.66% and F-score of 0.798, showing slight improvement. Accuracy (72.54%) > NIR (64.96%)

Select most important features

var_imp <- rf_final$importance
top_features <- rownames(var_imp[order(var_imp[, "MeanDecreaseAccuracy"], 
                                       decreasing = TRUE), ])[1:8]

Check for multicollinearity using Cramers V

cramers_df <- combn(top_features, 2, simplify = FALSE) %>%
  map_dfr(~ data.frame(
    var1 = .x[1],
    var2 = .x[2],
    cramersV = cramersV(
      rf_model_data[[.x[1]]],
      rf_model_data[[.x[2]]]
    )
  ))

cramers_df <- cramers_df %>% 
  filter(cramersV > 0.6)
appearance_counts <- cramers_df %>%
  pivot_longer(cols = c(var1, var2),
               names_to = "side",
               values_to = "variable") %>%
  count(variable, sort = TRUE)

cramers_df
## [1] var1     var2     cramersV
## <0 rows> (or 0-length row.names)
appearance_counts
## # A tibble: 0 × 2
## # ℹ 2 variables: variable <chr>, n <int>

Save Random Forest model, clean environment

save(rf_model_data, m_best, ntrees_best, rf_final, top_features, split, 
     file = "data processing/RF_results.Rdata")

rm(list = ls())

Model 2: Logistic Regression

Load important objects

load("data processing/RF_results.Rdata")

imp_formula <- as.formula(paste("success_v1 ~", 
                                paste(top_features, collapse = " + ")))

Base Model

set.seed(22)
imp_model <- glm(imp_formula, family = binomial, data = rf_model_data[split,])
summary(imp_model)
## 
## Call:
## glm(formula = imp_formula, family = binomial, data = rf_model_data[split, 
##     ])
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             2.01754    1.97609   1.021  0.30727    
## draft_round1            2.45722    0.27990   8.779  < 2e-16 ***
## draft_round2            2.86060    0.28367  10.084  < 2e-16 ***
## draft_round3            3.19032    0.26549  12.017  < 2e-16 ***
## draft_round4            2.52897    0.22529  11.225  < 2e-16 ***
## draft_round5            1.97794    0.20470   9.663  < 2e-16 ***
## draft_round6            2.03173    0.21447   9.473  < 2e-16 ***
## draft_round7            1.48468    0.22170   6.697 2.13e-11 ***
## dr_av                   0.07057    0.01240   5.689 1.27e-08 ***
## side_of_ballOff        -0.10664    0.11933  -0.894  0.37153    
## side_of_ballST          1.63365    0.33504   4.876 1.08e-06 ***
## age                    -0.12776    0.08776  -1.456  0.14545    
## primary_roster_teamATL -0.14039    0.50614  -0.277  0.78149    
## primary_roster_teamBAL -1.21847    0.44903  -2.714  0.00666 ** 
## primary_roster_teamBUF -1.08238    0.49047  -2.207  0.02733 *  
## primary_roster_teamCAR -0.35968    0.45737  -0.786  0.43163    
## primary_roster_teamCHI -0.33592    0.46942  -0.716  0.47424    
## primary_roster_teamCIN -0.08117    0.45455  -0.179  0.85827    
## primary_roster_teamCLE -0.46133    0.44313  -1.041  0.29784    
## primary_roster_teamDAL -0.05129    0.45719  -0.112  0.91068    
## primary_roster_teamDEN -0.67843    0.44594  -1.521  0.12818    
## primary_roster_teamDET -1.06850    0.45896  -2.328  0.01991 *  
## primary_roster_teamGB  -0.51799    0.43200  -1.199  0.23051    
## primary_roster_teamHOU -0.09955    0.47386  -0.210  0.83361    
## primary_roster_teamIND -0.77103    0.44339  -1.739  0.08204 .  
## primary_roster_teamJAX -0.91320    0.43406  -2.104  0.03539 *  
## primary_roster_teamKC   0.08289    0.50197   0.165  0.86883    
## primary_roster_teamLA   0.21756    0.44347   0.491  0.62372    
## primary_roster_teamLAC -0.52243    0.43782  -1.193  0.23277    
## primary_roster_teamLV  -0.21396    0.44254  -0.483  0.62875    
## primary_roster_teamMIA -0.28732    0.45612  -0.630  0.52875    
## primary_roster_teamMIN -0.48250    0.44796  -1.077  0.28144    
## primary_roster_teamNE  -0.52072    0.46853  -1.111  0.26640    
## primary_roster_teamNO  -0.57211    0.46241  -1.237  0.21600    
## primary_roster_teamNYG -0.11842    0.44879  -0.264  0.79188    
## primary_roster_teamNYJ -0.19157    0.45844  -0.418  0.67603    
## primary_roster_teamPHI -0.30919    0.46232  -0.669  0.50363    
## primary_roster_teamPIT -0.50761    0.51742  -0.981  0.32657    
## primary_roster_teamSEA -0.37580    0.46627  -0.806  0.42026    
## primary_roster_teamSF  -0.19117    0.46209  -0.414  0.67908    
## primary_roster_teamTB  -0.13320    0.46512  -0.286  0.77459    
## primary_roster_teamTEN -0.68994    0.45957  -1.501  0.13329    
## primary_roster_teamWAS -0.18134    0.43870  -0.413  0.67935    
## num_schools2           -0.19437    0.16186  -1.201  0.22981    
## num_schoolsOther       -1.54158    0.84311  -1.828  0.06748 .  
## comp_levelFCS           0.24103    0.27321   0.882  0.37765    
## comp_levelFCS Power    -0.05548    0.29594  -0.187  0.85130    
## comp_levelG5           -0.11964    0.13142  -0.910  0.36262    
## different_position1     0.02404    0.12954   0.186  0.85277    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2736.2  on 2110  degrees of freedom
## Residual deviance: 2109.6  on 2062  degrees of freedom
## AIC: 2207.6
## 
## Number of Fisher Scoring iterations: 6
vif(imp_model) ## ALL UNDER 3
##                         GVIF Df GVIF^(1/(2*Df))
## draft_round         2.047566  7        1.052522
## dr_av               1.516863  1        1.231610
## side_of_ball        1.288500  2        1.065421
## age                 1.319426  1        1.148663
## primary_roster_team 1.358856 31        1.004958
## num_schools         1.081476  2        1.019775
## comp_level          1.162246  3        1.025376
## different_position  1.206517  1        1.098415
lr_probs <- predict(imp_model, rf_model_data[!split,], type = "response")
lr_class <- ifelse(lr_probs > 0.5, 1, 0)
lr_misc <- 1 - mean(lr_class == rf_model_data$success_v1[!split]) 
lr_cm <- table(rf_model_data[!split,]$success_v1, lr_class)

tp <- lr_cm["1", "1"]
fp <- lr_cm["0", "1"]
fn <- lr_cm["1", "0"]
precision <- tp / (tp + fp)
recall <- tp / (tp + fn)

f1 <- 2 * precision * recall / (precision + recall) 
rm(tp, fp, fn)

All VIFs under 2.5, Misclassification rate of 29.55%, Accuracy of 70.45%, F1 of 0.7672.

Stepwise selection

full <- glm(imp_formula, family = binomial, data = rf_model_data[split,])
null <- glm(success_v1 ~ 1, family = binomial, data = rf_model_data[split,])

step_imp_model <- step(null, scope = formula(full), 
                       direction = "both", k = 2)
## Start:  AIC=2738.17
## success_v1 ~ 1
## 
##                       Df Deviance    AIC
## + draft_round          7   2227.8 2243.8
## + dr_av                1   2690.4 2694.4
## + comp_level           3   2702.1 2710.1
## + num_schools          2   2720.7 2726.7
## + side_of_ball         2   2728.9 2734.9
## <none>                     2736.2 2738.2
## + age                  1   2736.1 2740.1
## + different_position   1   2736.2 2740.2
## + primary_roster_team 31   2699.6 2763.6
## 
## Step:  AIC=2243.75
## success_v1 ~ draft_round
## 
##                       Df Deviance    AIC
## + dr_av                1   2189.8 2207.8
## + side_of_ball         2   2206.4 2226.4
## + age                  1   2222.2 2240.2
## + num_schools          2   2221.7 2241.7
## <none>                     2227.8 2243.8
## + different_position   1   2226.8 2244.8
## + comp_level           3   2225.7 2247.7
## + primary_roster_team 31   2191.2 2269.2
## - draft_round          7   2736.2 2738.2
## 
## Step:  AIC=2207.77
## success_v1 ~ draft_round + dr_av
## 
##                       Df Deviance    AIC
## + side_of_ball         2   2159.6 2181.6
## + num_schools          2   2184.4 2206.4
## + age                  1   2187.2 2207.2
## <none>                     2189.8 2207.8
## + different_position   1   2187.8 2207.8
## + comp_level           3   2187.6 2211.6
## + primary_roster_team 31   2151.4 2231.4
## - dr_av                1   2227.8 2243.8
## - draft_round          7   2690.4 2694.4
## 
## Step:  AIC=2181.59
## success_v1 ~ draft_round + dr_av + side_of_ball
## 
##                       Df Deviance    AIC
## + num_schools          2   2154.2 2180.2
## + age                  1   2157.2 2181.2
## <none>                     2159.6 2181.6
## + different_position   1   2159.5 2183.5
## + comp_level           3   2157.6 2185.6
## + primary_roster_team 31   2119.9 2203.9
## - side_of_ball         2   2189.8 2207.8
## - dr_av                1   2206.4 2226.4
## - draft_round          7   2678.8 2686.8
## 
## Step:  AIC=2180.25
## success_v1 ~ draft_round + dr_av + side_of_ball + num_schools
## 
##                       Df Deviance    AIC
## <none>                     2154.2 2180.2
## + age                  1   2152.5 2180.5
## - num_schools          2   2159.6 2181.6
## + different_position   1   2154.2 2182.2
## + comp_level           3   2152.2 2184.2
## + primary_roster_team 31   2113.8 2201.8
## - side_of_ball         2   2184.4 2206.4
## - dr_av                1   2200.5 2224.5
## - draft_round          7   2666.1 2678.1
summary(step_imp_model)
## 
## Call:
## glm(formula = success_v1 ~ draft_round + dr_av + side_of_ball + 
##     num_schools, family = binomial, data = rf_model_data[split, 
##     ])
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -1.19426    0.15248  -7.832 4.80e-15 ***
## draft_round1      2.43614    0.27270   8.934  < 2e-16 ***
## draft_round2      2.81228    0.27750  10.134  < 2e-16 ***
## draft_round3      3.07013    0.25931  11.840  < 2e-16 ***
## draft_round4      2.43049    0.21501  11.304  < 2e-16 ***
## draft_round5      1.89147    0.19369   9.766  < 2e-16 ***
## draft_round6      1.92003    0.20249   9.482  < 2e-16 ***
## draft_round7      1.39396    0.20695   6.736 1.63e-11 ***
## dr_av             0.07051    0.01216   5.798 6.70e-09 ***
## side_of_ballOff  -0.11520    0.10891  -1.058   0.2901    
## side_of_ballST    1.56227    0.32549   4.800 1.59e-06 ***
## num_schools2     -0.16948    0.15663  -1.082   0.2792    
## num_schoolsOther -1.54079    0.82608  -1.865   0.0622 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2736.2  on 2110  degrees of freedom
## Residual deviance: 2154.2  on 2098  degrees of freedom
## AIC: 2180.2
## 
## Number of Fisher Scoring iterations: 6

Forward selection

fwd_imp_model <- step(null, scope = formula(full), 
                      direction = "forward", k = 2, trace = FALSE)
summary(fwd_imp_model)
## 
## Call:
## glm(formula = success_v1 ~ draft_round + dr_av + side_of_ball + 
##     num_schools, family = binomial, data = rf_model_data[split, 
##     ])
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -1.19426    0.15248  -7.832 4.80e-15 ***
## draft_round1      2.43614    0.27270   8.934  < 2e-16 ***
## draft_round2      2.81228    0.27750  10.134  < 2e-16 ***
## draft_round3      3.07013    0.25931  11.840  < 2e-16 ***
## draft_round4      2.43049    0.21501  11.304  < 2e-16 ***
## draft_round5      1.89147    0.19369   9.766  < 2e-16 ***
## draft_round6      1.92003    0.20249   9.482  < 2e-16 ***
## draft_round7      1.39396    0.20695   6.736 1.63e-11 ***
## dr_av             0.07051    0.01216   5.798 6.70e-09 ***
## side_of_ballOff  -0.11520    0.10891  -1.058   0.2901    
## side_of_ballST    1.56227    0.32549   4.800 1.59e-06 ***
## num_schools2     -0.16948    0.15663  -1.082   0.2792    
## num_schoolsOther -1.54079    0.82608  -1.865   0.0622 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2736.2  on 2110  degrees of freedom
## Residual deviance: 2154.2  on 2098  degrees of freedom
## AIC: 2180.2
## 
## Number of Fisher Scoring iterations: 6

Backward

bwd_imp_model <- step(full, direction = "backward", k = 2, trace = FALSE)
summary(bwd_imp_model)
## 
## Call:
## glm(formula = success_v1 ~ draft_round + dr_av + side_of_ball + 
##     num_schools, family = binomial, data = rf_model_data[split, 
##     ])
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -1.19426    0.15248  -7.832 4.80e-15 ***
## draft_round1      2.43614    0.27270   8.934  < 2e-16 ***
## draft_round2      2.81228    0.27750  10.134  < 2e-16 ***
## draft_round3      3.07013    0.25931  11.840  < 2e-16 ***
## draft_round4      2.43049    0.21501  11.304  < 2e-16 ***
## draft_round5      1.89147    0.19369   9.766  < 2e-16 ***
## draft_round6      1.92003    0.20249   9.482  < 2e-16 ***
## draft_round7      1.39396    0.20695   6.736 1.63e-11 ***
## dr_av             0.07051    0.01216   5.798 6.70e-09 ***
## side_of_ballOff  -0.11520    0.10891  -1.058   0.2901    
## side_of_ballST    1.56227    0.32549   4.800 1.59e-06 ***
## num_schools2     -0.16948    0.15663  -1.082   0.2792    
## num_schoolsOther -1.54079    0.82608  -1.865   0.0622 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2736.2  on 2110  degrees of freedom
## Residual deviance: 2154.2  on 2098  degrees of freedom
## AIC: 2180.2
## 
## Number of Fisher Scoring iterations: 6

Plot variable importance of all logistic regression models for comparison

get_varimp_df <- function(model, model_name) {
  coef_df <- tidy(model, conf.int = TRUE) %>% 
    filter(term != "(Intercept)") %>% 
    mutate(odds_ratio = exp(estimate), 
           conf.low = exp(conf.low), 
           conf.high = exp(conf.high),
           model = model_name)
}

varimp_df <- bind_rows(
  get_varimp_df(imp_model, "Full"),
  get_varimp_df(step_imp_model, "Stepwise"),
  get_varimp_df(fwd_imp_model, "Forward"),
  get_varimp_df(bwd_imp_model, "Backward")
)

ggplot(varimp_df,
       aes(x = reorder(term, odds_ratio), y = odds_ratio)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  facet_wrap(~ model, scales = "free_y") +
  labs(
    title = "Variable Importance Across Logistic Regression Models",
    x = "Feature",
    y = "Odds Ratio"
  ) +
  theme_minimal() +
  theme(
    strip.text = element_text(face = "bold"),
    axis.text.y = element_text(size = 9)
  )

Select Final Logistic Regression Model

final_reg_model <- step_imp_model

summary(final_reg_model)
## 
## Call:
## glm(formula = success_v1 ~ draft_round + dr_av + side_of_ball + 
##     num_schools, family = binomial, data = rf_model_data[split, 
##     ])
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -1.19426    0.15248  -7.832 4.80e-15 ***
## draft_round1      2.43614    0.27270   8.934  < 2e-16 ***
## draft_round2      2.81228    0.27750  10.134  < 2e-16 ***
## draft_round3      3.07013    0.25931  11.840  < 2e-16 ***
## draft_round4      2.43049    0.21501  11.304  < 2e-16 ***
## draft_round5      1.89147    0.19369   9.766  < 2e-16 ***
## draft_round6      1.92003    0.20249   9.482  < 2e-16 ***
## draft_round7      1.39396    0.20695   6.736 1.63e-11 ***
## dr_av             0.07051    0.01216   5.798 6.70e-09 ***
## side_of_ballOff  -0.11520    0.10891  -1.058   0.2901    
## side_of_ballST    1.56227    0.32549   4.800 1.59e-06 ***
## num_schools2     -0.16948    0.15663  -1.082   0.2792    
## num_schoolsOther -1.54079    0.82608  -1.865   0.0622 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2736.2  on 2110  degrees of freedom
## Residual deviance: 2154.2  on 2098  degrees of freedom
## AIC: 2180.2
## 
## Number of Fisher Scoring iterations: 6
vif(final_reg_model) ## ALL UNDER 2, only 4 vars
##                  GVIF Df GVIF^(1/(2*Df))
## draft_round  1.514226  7        1.030080
## dr_av        1.492082  1        1.221508
## side_of_ball 1.054453  2        1.013344
## num_schools  1.011185  2        1.002785
rm(list = setdiff(ls(), c("final_reg_model", "rf_model_data", "split")))

Final predictions & assessment

final_probs <- predict(final_reg_model, rf_model_data[!split,], type = "response")
pred_class <- ifelse(final_probs > 0.5, 1, 0)
misc_rate <- 1 - mean(pred_class == rf_model_data$success_v1[!split])
cm <- table(rf_model_data[!split,]$success_v1, pred_class)

tp <- cm["1", "1"]
fp <- cm["0", "1"]
fn <- cm["1", "0"]
precision <- tp / (tp + fp)
recall <- tp / (tp + fn)

f1 <- 2 * precision * recall / (precision + recall) 
rm(tp, fp, fn)

Misclassification rate of 27.84%, Accuracy of 72.16%, F1 of 0.7

Save model for future application

save(final_reg_model, file = "data processing/future predictions.Rdata")

Check model assumptions

plot(final_reg_model)

cooks_d <- cooks.distance(final_reg_model)
cutoff <- 4 / nrow(rf_model_data[split, ])
num_influential <- sum(cooks_d > cutoff, na.rm = TRUE)
print(paste0("Number of Influential Observations (out of 2111): ", 
             num_influential))
## [1] "Number of Influential Observations (out of 2111): 116"
influencePlot(final_reg_model)

##         StudRes         Hat       CookD
## 16   -3.1478306 0.002529918 0.023604327
## 36   -3.2479772 0.002069272 0.026183771
## 957  -0.9018721 0.159614000 0.007655024
## 1857  1.3666086 0.175580064 0.024471876
## 2021  2.2393826 0.066601427 0.048042772

Not great, but logistic regression is more robust so linearity isn’t as important. I also don’t have any truly continuous variables, so linearity vs log-odds doesn’t need to be checked. Still definitely a limitation of this project.

Probability Distributions

prob_df <- data.frame(
  success_v1 = rf_model_data$success_v1[!split],
  lr_probs = final_probs
)

ggplot(prob_df, aes(x = final_probs)) +
  geom_density(fill = "#2E86AB") +
  labs(
    title = "Distribution of Predicted Rookie Success Probabilities",
    x = "Predicted Probability",
    y = "Count"
  ) +
  theme_minimal(base_size = 13)

ggplot(prob_df, aes(x = final_probs, fill = success_v1)) +
  geom_density(alpha = 0.6) +
  scale_fill_manual(values = c("0" = "#B0B0B0", "1" = "#2E86AB")) +
  labs(
    title = "Predicted Probabilities by Actual Outcome",
    subtitle = "Success (1) vs Not Success (0)",
    x = "Predicted Probability",
    fill = ""
  ) +
  theme_minimal(base_size = 13) +
  theme(
    plot.title = element_text(face = "bold"),
    legend.position = "top"
  )

Check Feature/Variable Importance and Coefficients

std_coefs <- standardize_parameters(final_reg_model)
ggplot(std_coefs,
       aes(x = reorder(Parameter, abs(Std_Coefficient)),
           y = Std_Coefficient)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Standardized Coefficients (Relative Importance)",
    x = "Feature",
    y = "Standardized Effect"
  ) +
  theme_minimal()

coef_df <- tidy(final_reg_model, conf.int = TRUE) %>%
  filter(term != "(Intercept)") %>%
  mutate(
    odds_ratio = exp(estimate),
    conf.low = exp(conf.low),
    conf.high = exp(conf.high)
  )

ggplot(coef_df,
       aes(x = reorder(term, odds_ratio), y = odds_ratio)) +
  geom_point(size = 2) +
  geom_errorbar(
    aes(ymin = conf.low, ymax = conf.high),
    width = 0.2
  ) +
  geom_hline(yintercept = 1, linetype = "dashed") +
  coord_flip() +
  labs(
    title = "Logistic Regression Variable Importance",
    x = "Feature",
    y = "Odds Ratio"
  ) +
  theme_minimal()

Compare Categorical Predictors

(for fun)

prob_df_full <- data.frame(
  draft_round = rf_model_data$draft_round[!split],
  dr_av = rf_model_data$dr_av[!split],
  side_of_ball = rf_model_data$side_of_ball[!split],
  num_schools = rf_model_data$num_schools[!split],
  final_probs = final_probs
)

ggplot(prob_df_full, aes(x = draft_round, y = side_of_ball, 
                         fill = final_probs)) +
  geom_tile(color = "black") + 
  labs(x = "Draft Round Category", y = "Side of Ball", 
       fill = "Success Probability") +
  theme_classic()

ggplot(prob_df_full, aes(x = draft_round, y = num_schools, fill = final_probs)) +
  geom_tile(color = "black") + 
  labs(x = "Draft Round Category", y = "Number of Colleges", 
       fill = "Success Probability") +
  theme_classic()

ggplot(prob_df_full, aes(x = num_schools, y = side_of_ball, fill = final_probs)) +
  geom_tile(color = "black") + 
  labs(x = "Number of Colleges", y = "Side of Ball", 
       fill = "Success Probability") +
  theme_classic()

### Clean Environment

rm(list = ls())

Validate/Apply to 2025 Rookies

Read & Prepare data same as model prep

data <- read_csv("data processing/full_data.csv")

data_for_val <- data %>% 
  filter(season == 2025) %>% 
  mutate(primary_roster_team = as.factor(primary_roster_team),
         multiple_teams = as.factor(multiple_teams),
         success_v1 = as.factor(success_v1),
         num_schools = as.factor(num_schools),
         side_of_ball = as.factor(side_of_ball),
         position_group = as.factor(position_group),
         div_level = as.factor(div_level) %>% relevel("D1 FBS"),
         comp_level = as.factor(comp_level) %>% relevel("P5"),
         pos_skill_flag = as.factor(pos_skill_flag),
         different_position = as.factor(different_position),
         drafted = as.factor(drafted),
         changed_teams = as.factor(changed_teams),
         draft_round = as.factor(draft_round) %>% relevel("Undrafted"),
         draft_round_cat = as.factor(draft_round_cat) %>% relevel("Undrafted")
  )

load("data processing/future predictions.Rdata")
load("data processing/avg_imputation.Rdata")

data_for_val <- data_for_val %>% 
  mutate(dr_av_missing = as.factor(ifelse(is.na(dr_av), 1, 0)),
         age_missing = as.factor(ifelse(is.na(age), 1, 0)))

data_for_val <- data_for_val %>%
  left_join(avg_dr_av_pos_round, by = c("position_group", "draft_round"), 
            suffix = c("", "_pos_round")) %>%
  left_join(avg_dr_av_pos, by = "position_group",
            suffix = c("", "_pos"))

data_for_val <- data_for_val %>%
  mutate(dr_av_imputed = if_else(is.na(dr_av),
                                 coalesce(.data$avg_dr_av, .data$avg_dr_av_pos),
                                 dr_av)) %>% 
  select(-avg_dr_av, -avg_dr_av_pos)

data_for_val <- data_for_val %>% 
  mutate(dr_av = dr_av_imputed,
         age = ifelse(is.na(age), avg_age, age)) %>% 
  select(-dr_av_imputed)

gg_miss_var(data_for_val) ## 2 vars, but not being used in my analysis

2 missing variables, but they’re not predictors in the model, so I’m not concerned.

Make predictions for 2025

data_for_val$num_schools <- factor(
  data_for_val$num_schools,
  levels = levels(model.frame(final_reg_model)$num_schools)
)

pred_probs_25 <- predict(final_reg_model, data_for_val, 
                         type = "response")

pred_success_25 <- ifelse(pred_probs_25 > 0.5, 1, 0)

results <- data_for_val %>%
  mutate(pred_prob = pred_probs_25)

Probability Distribution for 2025 data

ggplot(results, aes(x = pred_prob)) +
  geom_density(fill = "#4C72B0") +
  labs(
    title = "Distribution of Predicted Rookie Success Probabilities",
    x = "Predicted Probability",
    y = "Count"
  ) +
  theme_minimal(base_size = 13)

There are 14 players who predicted NA (they have NA num_schools), which I believe became NA when releveled to match model factors -> they’ve attended more than 2 colleges, which is weird but a growing trend in cfb, so definitely worth investigating in the future.

Check Rookies Who Have Already Succeeded

checks <- results %>% 
  filter(!is.na(pred_prob)) %>% 
  mutate(pred_success = ifelse(pred_prob > 0.5, 1, 0),
         hit_success_wk14 = ifelse(pred_success == 1 & weeks_active > 8, 1, 0),
         miss_success_wk14 = ifelse(pred_success == 1 & weeks_active < 6, 1, 0),
         hit_fail_wk14 = ifelse(pred_success == 0 & weeks_active < 6, 1, 0),
         miss_fail_wk14 = ifelse(pred_success == 0 & weeks_active > 8, 1, 0),
         tbd_success_wk14 = ifelse(pred_success == 1 & weeks_active < 9 & 
                                     weeks_active > 5, 1, 0),
         tbd_fail_wk14 = ifelse(pred_success == 0 & weeks_active > 5 &
                                  weeks_active < 9, 1, 0)) %>% 
  select(primary_roster_team, weeks_active, full_name, position, team, 
         draft_round, dr_av, side_of_ball, num_schools, pred_prob, pred_success,
         hit_success_wk14, miss_success_wk14, hit_fail_wk14, miss_fail_wk14,
         tbd_success_wk14, tbd_fail_wk14) %>% 
  mutate(hit_success_wk14 = as.factor(hit_success_wk14),
         miss_success_wk14 = as.factor(miss_success_wk14),
         hit_fail_wk14 = as.factor(hit_fail_wk14),
         miss_fail_wk14 = as.factor(miss_fail_wk14),
         tbd_success_wk14 = as.factor(tbd_success_wk14),
         tbd_fail_wk14 = as.factor(tbd_fail_wk14))

ggplot(checks, aes(x = pred_success, y = tbd_success_wk14, fill = pred_prob)) +
  geom_tile(color = "black") + 
  labs(x = "Predicted Success", y = "Still Possible to Succeed", 
       fill = "Success Probability") +
  theme_classic()

count(checks, tbd_success_wk14)
## # A tibble: 2 × 2
##   tbd_success_wk14     n
##   <fct>            <int>
## 1 0                  234
## 2 1                   26

Not entirely certain of the code logic, but definitely something of interest.

Check “Cool Cases”

max_prob <- results %>% 
  arrange(desc(pred_prob)) %>% 
  slice(1)

min_prob <- results %>% 
  arrange(pred_prob) %>% 
  slice(1)

cool_cases <- results %>% 
  filter(full_name == max_prob$full_name |
           full_name == min_prob$full_name |
           full_name == "Shedeur Sanders" |
           full_name == "Jalen Milroe" |
           full_name == "Darius Cooper") %>% 
  select(full_name, pred_prob, weeks_active, draft_round, side_of_ball, dr_av,
         num_schools, dr_av_missing) %>% 
  arrange(desc(pred_prob)) 


cool_cases_tbl <- cool_cases %>%
  mutate(
    pred_prob = percent(pred_prob, accuracy = 0.1),
    draft_round = as.character(draft_round),
    dr_av_missing = ifelse(dr_av_missing == "1", "Yes", "No")
  )

cool_cases_tbl %>%
  gt() %>%
  gt_theme_pff() %>% 
  cols_label(
    full_name = "Player",
    pred_prob = "Predicted Probability",
    weeks_active = "Weeks Active",
    draft_round = "Draft Round",
    side_of_ball = "Side of Ball",
    dr_av = "Draft AV",
    dr_av_missing = "Draft AV Imputed?",
    num_schools = "# Colleges"
  ) %>%
  fmt_markdown(columns = full_name) %>%
  tab_header(
    title = "'Cool Case' Player Predictions",
  ) %>%
  tab_source_note(
    source_note = "Predictions from Logistic Regression Model"
  )
'Cool Case' Player Predictions
Player Predicted Probability Weeks Active Draft Round Side of Ball Draft AV # Colleges Draft AV Imputed?
Tyler Loop 90.8% 14 6 ST 0 1 No
Jalen Milroe 85.3% 4 3 Off 0 1 No
Shedeur Sanders 60.2% 8 5 Off 0 2 No
Darius Cooper 35.3% 10 Undrafted Off 10 1 Yes
Ben Yurosek 27.2% 9 Undrafted Off 7 2 Yes
Predictions from Logistic Regression Model